# libraries
library(dplyr)
library(progress)
library(text2vec)

# Load the preprocessed corpus; the rest of the script reads and rewrites
# its `text` element.
corpus <- readRDS(file = "data/outputs/chavez_discourse_preprocessed.rds")

#============================================
# EXPLORE COLLOCATIONS
#============================================

# stopwords
# Union of the stopwords-iso and Snowball Spanish lists. The two vectors are
# concatenated before de-duplication because the second positional argument
# of unique() is `incomparables`, not a second vector to merge.
stopwords_es <- unique(c(
  stopwords::data_stopwords_stopwordsiso$es,
  stopwords::data_stopwords_snowball$es
))
# Strip accents (chartr() is vectorised, so no lapply() is needed);
# presumably this mirrors the accent-free text of the corpus -- TODO confirm.
stopwords_es <- chartr(
  "ãâàèìòùáéíóöúüûñÀÈÌÒÙÁÉÍÓÚÑ",
  "aaaeiouaeioouuunAEIOUAEIOUN",
  stopwords_es
)
stopwords_es <- gsub("[^a-zA-Z]", "", stopwords_es)  # keep only text
# Drop entries emptied by the cleaning and duplicates created by it
# (e.g. accented and unaccented variants collapsing to the same string).
stopwords_es <- unique(stopwords_es[stopwords_es != ""])

# create vocabulary
# Split each document on whitespace and wrap the tokens in a text2vec iterator.
tokens <- space_tokenizer(corpus$text)
it <- itoken(tokens, progressbar = FALSE)
# Alternative kept for reference: exclude Spanish stopwords while counting.
#vocab <- create_vocabulary(it, stopwords = stopwords_es)
# Count terms, then cap the vocabulary at 10,000 terms.
vocab <- it %>%
  create_vocabulary() %>%
  prune_vocabulary(vocab_term_max = 10000)
# Alternative pruning rule kept for reference: minimum term count.
#vocab <- prune_vocabulary(vocabulary = vocab, term_count_min = 10)

# create model
# Fit a text2vec collocation model on the token iterator and rank candidate
# bigrams by two association statistics (lfmd and llr).
#model <- Collocations$new(vocabulary = vocab, collocation_count_min = 10, pmi = 5, gensim = 500, lfmd = -25)
model <- Collocations$new(vocabulary = vocab, collocation_count_min = 10)
model_fit <- model$fit(it, n_iter = 5)

# Ad-hoc score: joint count over the product of the marginal counts.
# The marginals are converted to double first: if they are stored as integers,
# n_i * n_j overflows to NA (with a warning) for very frequent words.
model_fit <- model_fit %>%
  mutate(score = n_ij / (as.numeric(n_i) * as.numeric(n_j)))

# Frequent pairs only (joint count of at least 100), for manual inspection.
colloc_subset <- model_fit %>% filter(n_ij >= 100)

# Rankings by each statistic, best first, with an underscore-joined label.
lfmd <- model_fit[, c("prefix", "suffix", "lfmd")] %>%
  arrange(desc(lfmd)) %>%
  mutate(colloc = paste(prefix, suffix, sep = "_"))
llr <- model_fit[, c("prefix", "suffix", "n_ij", "llr")] %>%
  arrange(desc(llr)) %>%
  mutate(colloc = paste(prefix, suffix, sep = "_"))

# Candidates are pairs in the top 1000 of both rankings. head() is used
# instead of [1:1000] so that a table with fewer than 1000 rows does not pad
# both vectors with NA and leak an NA into the intersection.
candidates <- intersect(head(llr$colloc, 1000), head(lfmd$colloc, 1000))
# NOTE(review): the variant below refers to a `score` table that is never built.
#candidates <- Reduce(intersect, list(llr$colloc[1:1000],lfmd$colloc[1:1000],score$colloc[1:1000]))

#============================================
# DEFINE COLLOCATIONS
#============================================

# collocations identified from transcripts
# Hand-curated multi-word expressions, written as they appear in the
# preprocessed text (lower case, no accents, words separated by one space).
# Entries must use spaces, not underscores: the underscore form is derived
# later, and an entry that already contains "_" can never match raw text.
# Fixed relative to the previous list: "diosdallo cabello" -> "diosdado cabello",
# "clases pupulares" -> "clases populares", "industrias basica" ->
# "industrias basicas", "clase_obrera" -> "clase obrera"; repeated entries
# were removed. unique() guards against duplicates being reintroduced.
collocations <- unique(c(
  "cuarta republica", "quinta republica", "iv republica", "v republica",
  "punto fijo", "partido socialista", "partido comunista", "movimiento v republica", "partidos aliados",
  "partido socialista unido de venezuela",
  "accion democratica", "clase media", "clase alta", "clase baja", "clases medias",
  "clases altas", "clases bajas", "carlos andres", "estados unidos", "tribunal supremo",
  "consejo nacional electoral", "asamblea nacional", "america latina", "fuerza armada", "union europea",
  "consejos comunales", "simon bolivar", "circulo bolivariano", "circulos bolivarianos",
  "consejo comunal", "siglo xxi", "revolucion bolivariana",
  "fuerzas armadas", "fuerte tiuna", "polo patriotico", "coordinadora democratica",
  "fondo monetario", "carlos marx", "civico militar", "pueblos indigenas", "democracia representativa",
  "democracia participativa",
  "vuelvan caras", "barrio adentro", "amor mayor", "negro primero",
  "sector privado", "sectores populares", "referendum revocatorio", "siglo xx", "siglo xix",
  "cipriano castro", "sectores medios", "producto interno bruto", "situacion economica",
  "crisis mundial", "crisis economica", "crisis financiera", "crisis politica", "crisis energetica",
  "crisis alimentaria", "crisis historica", "crisis bancaria", "guerra psicologica",
  "guerra mediatica", "guerra sucia", "guerra fria", "guerra civil", "guerra federal",
  "guerra interna", "guerra mundial", "guerra revolucionaria", "guerra economica",
  "marta colomina", "napoleon bravo", "napoleon bonaparte", "gobiernos anteriores",
  "gobiernos locales", "gobiernos estadales", "gobiernos regionales",
  "gobierno revolucionario", "gobierno nacional", "gobierno bolivariano", "gobierno anterior",
  "gobierno colombiano", "gobierno norteamericano", "gobierno cubano", "gobierno chino",
  "gobierno regional", "gobierno local", "gobierno argentino",
  "gobierno imperialista", "estado mayor", "estado barinas", "estado plurinacional de bolivia",
  "estados americanos", "estado nueva esparta", "estado lara", "estado anzoategui",
  "estado burgues", "estado aragua", "estado monagas", "estado cojedes", "estado vargas",
  "estado zulia", "estado falcon", "estado tachira", "estado portuguesa", "estado yaracuy",
  "estado guarico", "estado miranda", "estado merida", "estado trujillo", "estado amazonas",
  "estado carabobo", "estado delta amacuro", "estado sucre", "estado apure",
  "hugo chavez", "hugo chavez frias",
  "presidente chavez", "presidente hugo chavez",
  "presidente uribe", "presidente electo", "presidente evo morales", "banco central",
  "presidente pastrana", "presidente cardoso", "presidente rafael correa", "presidente correa",
  "presidente morales", "presidente alvaro uribe", "presidente fox", "presidente hugo rafael chavez frias",
  "presidente hu jintao", "presidente nestor kirchner", "pueblo de venezuela",
  "pueblo venezolano", "pueblo cubano", "pueblo boliviano",
  "pueblo argentino", "pueblo colombiano", "pueblo ecuatoriano", "pueblo uruguayo",
  "pueblo chino", "pueblo palestino", "pueblo dominicano", "pueblo chileno",
  "pueblo hondureno", "pueblo haitiano", "pueblo arabe", "pueblo paraguayo",
  "pueblos aborigenes", "pueblos originarios", "pueblo peruano", "candidato chavez",
  "revolucion cubana", "proyecto bolivariano", "proyecto revolucionario",
  "proyecto neoliberal", "proyecto socialista", "proyecto nacional simon bolivar",
  "proyecto constitucional", "proyecto productivo", "proyectos productivos", "proyecto piloto",
  "proyecto estrategico", "proyectos sociales", "proyectos agricolas", "proyecto mariscal sucre",
  "proyecto gasifero", "proyecto educativo", "proyecto turistico", "proyecto alternativo", "proyectos pilotos",
  "convenio cuba venezuela", "gran mision vivienda venezuela", "gran mision vivienda",
  "gran mision", "grandes misiones", "grandes ciudades", "grandes empresas",
  "pequenas y medianas empresas", "grandes productores", "grandes medios", "grandes empresarios",
  "grandes capitales", "grandes capitalistas", "grandes corporaciones", "grandes transnacionales",
  "grandes consumidores", "pequenos productores", "pequenos empresarios", "pequena empresa",
  "pequenas empresas", "pequena y mediana empresa",
  "medianos empresarios", "medianos productores", "candidato burgues",
  "grandes ligas", "paises desarrollados", "republica bolivariana", "republica dominicana",
  "republica argentina", "republica popular china", "republica federativa del brasil",
  "republica islamica", "republica oriental del uruguay", "poder popular", "poder judicial",
  "poder ejecutivo", "poder legislativo", "poder economico", "poder constituyente", "poder politico",
  "obras publicas", "poder comunal", "poder electoral", "poder moral", "poder adquisitivo",
  "poderes publicos", "poderes locales", "nicolas maduro", "diosdado cabello", "elias jaua",
  "jorge rodriguez", "cilia flores", "delcy rodriguez", "erika farias", "jacqueline farias",
  "rafael ramirez", "jorge giordani", "aristobulo isturiz", "jesse chacon", "industrias basicas",
  "ricardo menendez", "hector navarro", "nelson merentes", "industria petrolera",
  "reservas internacionales", "grandes cacaos", "productores agricolas",
  "productor agricola", "productores nacionales", "productos agricolas", "productor agropecuario",
  "campos petroleros", "trabajadores petroleros", "trabajadores y trabajadoras",
  "medios privados", "medios publicos", "medio siglo", "medio millon", "medio ambiente",
  "medio oriente", "medios comunitarios", "medios alternativos",
  "salas romer", "irene saez",
  "campana electoral", "campana mediatica", "campana admirable", "campanas mediaticas",
  "campanas electorales", "golpe militar", "golpe petrolero", "paro petrolero",
  "deuda externa", "deuda acumulada", "deuda laboral", "deuda publica", "deuda social",
  "clases populares", "romulo betancourt", "adultos mayores", "constitucion bolivariana",
  "primero justicia", "un nuevo tiempo", "causa r",
  "republica de venezuela", "republica bolivariana de venezuela",
  "democracia protagonica", "democracia verdadera", "democracia revolucionaria",
  "democracia plena", "democracia economica", "democracia venezolana",
  "falsa democracia", "verdadera democracia",
  "reforma constitucional", "enmienda constitucional",
  "clase obrera", "proyecto de ley",
  "modelos economicos", "modelo colonial", "modelo alternativo", "modelo neoliberal",
  "modelos politicos", "modelo capitalista", "modelo economico", "modelo educativo",
  "modelo productivo", "modelo democratico", "modelo petrolero", "modelo politico", "modelo socialista",
  "sistema economico", "sistema politico", "sistema democratico", "sistema capitalista",
  "nuevo sistema", "nuevo modelo", "nuevos modelos", "nuevos sistemas", "nuevo orden",
  "nueva constitucion", "nueva sociedad", "nueva geometria del poder", "nuevas instituciones",
  "nueva institucionalidad",
  "orden democratico", "orden politico", "orden economico"
))

# prepare collocations for replacement
# Build a lookup table with one row per expression:
#   pattern     - the expression as written in the text
#   replacement - the same expression joined by underscores; the roman-numeral
#                 spellings are mapped onto the spelled-out ordinal forms
#   nchar       - pattern length, used to sort longest first so that a long
#                 expression is replaced before any shorter one it contains
collocations <- tibble(pattern = collocations) %>%
  mutate(
    replacement = gsub(" ", "_", pattern),
    nchar = nchar(pattern)
  ) %>%
  arrange(desc(nchar)) %>%
  mutate(
    replacement = case_when(
      replacement == "iv_republica" ~ "cuarta_republica",
      replacement == "v_republica" ~ "quinta_republica",
      TRUE ~ replacement
    )
  )
saveRDS(collocations, "data/outputs/collocations.rds")

#============================================
# REPLACE COLLOCATIONS
#============================================
# Replace every collocation in the corpus text with its underscore-joined
# form, in table order. \\< and \\> anchor the match at word boundaries, so an
# expression is only replaced when it appears as whole words.
# The patterns are built once, outside the loop.
colloc_regex <- paste0("\\<", collocations$pattern, "\\>")
# Guard the empty case: with zero rows, 1:nrow() would iterate over c(1, 0)
# and substitute an NA pattern, turning the text into NA.
if (nrow(collocations) > 0) {
  pb <- progress_bar$new(total = nrow(collocations))
  for (j in seq_len(nrow(collocations))) {
    corpus$text <- gsub(colloc_regex[j], collocations$replacement[j], corpus$text)
    pb$tick()
  }
}

# check vocabulary
# Re-tokenise the rewritten text so the joined collocations can be inspected
# as single vocabulary terms.
tokens <- space_tokenizer(corpus$text)
it <- itoken(tokens, progressbar = FALSE)
# keep only words that meet the count threshold
vocab <- it %>%
  create_vocabulary() %>%
  prune_vocabulary(term_count_min = 10)

# save output
saveRDS(corpus, file = "data/outputs/chavez_discourse_preprocessed_colloc.rds")

